{
  "nbformat": 4,
  "nbformat_minor": 2,
  "metadata": {},
  "cells": [
    {
      "metadata": {},
      "source": [
        "<td>\n",
        "   <a target=\"_blank\" href=\"https://labelbox.com\" ><img src=\"https://labelbox.com/blog/content/images/2021/02/logo-v4.svg\" width=256/></a>\n",
        "</td>"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "<td>\n",
        "<a href=\"https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/prediction_upload/pdf_predictions.ipynb\" target=\"_blank\"><img\n",
        "src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"></a>\n",
        "</td>\n",
        "\n",
        "<td>\n",
        "<a href=\"https://github.com/Labelbox/labelbox-python/tree/master/examples/prediction_upload/pdf_predictions.ipynb\" target=\"_blank\"><img\n",
        "src=\"https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white\" alt=\"GitHub\"></a>\n",
        "</td>"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "# PDF Prediction Import "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "*Annotation types*\n",
        "- Checklist classification (including nested classifications)\n",
        "- Radio classifications (including nested classifications)\n",
        "- Free text classifications\n",
        "- Bounding box\n",
        "- Entities\n",
        "- Relationships (only supported for MAL imports)\n",
        "\n",
        "\n",
        "*NDJson*\n",
        "- Checklist classification (including nested classifications)\n",
        "- Radio classifications (including nested classifications)\n",
        "- Free text classifications\n",
        "- Bounding box \n",
        "- Entities \n",
        "- Relationships (only supported for MAL imports)"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "## Setup"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        " !pip install -q \"labelbox[data]\""
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "import uuid\n",
        "import json\n",
        "import requests\n",
        "import labelbox as lb\n",
        "import labelbox.types as lb_types"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Replace with your API key"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "API_KEY= \"\"\n",
        "client = lb.Client(API_KEY)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Supported Predictions"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "########## Entity ##########\n",
        "\n",
        "# Annotation Types\n",
        "entities_prediction = lb_types.ObjectAnnotation(\n",
        "    name=\"named_entity\",\n",
        "    confidence=0.5,\n",
        "    value= lb_types.DocumentEntity(\n",
        "        name=\"named_entity\",\n",
        "        textSelections=[\n",
        "            lb_types.DocumentTextSelection(\n",
        "                token_ids=[],\n",
        "                group_id=\"\",\n",
        "                page=1\n",
        "            )\n",
        "        ]\n",
        "    )\n",
        ")\n",
        "\n",
        "# NDJSON\n",
        "entities_prediction_ndjson = {\n",
        "    \"name\": \"named_entity\",\n",
        "    \"confidence\": 0.5,\n",
        "    \"textSelections\": [\n",
        "        {\n",
        "            \"tokenIds\": [\n",
        "                \"<UUID>\",\n",
        "            ],\n",
        "            \"groupId\": \"<UUID>\",\n",
        "            \"page\": 1,\n",
        "        }\n",
        "    ]\n",
        "}"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "########### Radio Classification #########\n",
        "\n",
        "# Annotation types\n",
        "radio_prediction = lb_types.ClassificationAnnotation(\n",
        "    name=\"radio_question\",\n",
        "    value=lb_types.Radio(answer =\n",
        "        lb_types.ClassificationAnswer(name = \"first_radio_answer\", confidence=0.5)\n",
        "    )\n",
        ")\n",
        "# NDJSON\n",
        "radio_prediction_ndjson = {\n",
        "  \"name\": \"radio_question\",\n",
        "  \"answer\": {\"name\": \"first_radio_answer\", \"confidence\":0.5}\n",
        "}"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "############ Checklist Classification ###########\n",
        "\n",
        "# Annotation types\n",
        "checklist_prediction = lb_types.ClassificationAnnotation(\n",
        "    name=\"checklist_question\",\n",
        "    value=lb_types.Checklist(answer = [\n",
        "        lb_types.ClassificationAnswer(name = \"first_checklist_answer\", confidence=0.5),\n",
        "        lb_types.ClassificationAnswer(name = \"second_checklist_answer\", confidence=0.5)\n",
        "    ])\n",
        "  )\n",
        "\n",
        "\n",
        "# NDJSON\n",
        "checklist_prediction_ndjson = {\n",
        "  \"name\": \"checklist_question\",\n",
        "  \"answer\": [\n",
        "    {\"name\": \"first_checklist_answer\", \"confidence\":0.5},\n",
        "    {\"name\": \"second_checklist_answer\", \"confidence\":0.5}\n",
        "  ]\n",
        "}"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "############ Bounding Box ###########\n",
        "\n",
        "bbox_dim_1 = {\n",
        "            \"top\": 135.3,\n",
        "            \"left\": 102.771,\n",
        "            \"height\": 109.843,\n",
        "            \"width\": 415.8\n",
        "      }\n",
        "bbox_prediction = lb_types.ObjectAnnotation(\n",
        "    name=\"bounding_box\",  # must match your ontology feature\"s name\n",
        "    value=lb_types.DocumentRectangle(\n",
        "        start=lb_types.Point(x=bbox_dim_1[\"left\"], y=bbox_dim_1[\"top\"]),  # x = left, y = top\n",
        "        end=lb_types.Point(x=bbox_dim_1[\"left\"] + bbox_dim_1[\"width\"], y=bbox_dim_1[\"top\"]+ bbox_dim_1[\"height\"]),  # x= left + width , y = top + height\n",
        "        page=0,\n",
        "        unit=lb_types.RectangleUnit.POINTS\n",
        "        )\n",
        "    )\n",
        "\n",
        "bbox_prediction_ndjson = {\n",
        "  \"name\": \"bounding_box\",\n",
        "  \"bbox\": bbox_dim_1,\n",
        "  \"page\": 0,\n",
        "  \"unit\": \"POINTS\"\n",
        "}"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "# ############ global nested classifications ###########\n",
        "\n",
        "nested_checklist_prediction = lb_types.ClassificationAnnotation(\n",
        "  name=\"nested_checklist_question\",\n",
        "  value=lb_types.Checklist(\n",
        "    answer=[lb_types.ClassificationAnswer(\n",
        "      name=\"first_checklist_answer\",\n",
        "      confidence=0.5, # Confidence scores should be added to the answer\n",
        "      classifications=[\n",
        "        lb_types.ClassificationAnnotation(\n",
        "          name=\"sub_checklist_question\",\n",
        "          value=lb_types.Checklist(\n",
        "            answer=[lb_types.ClassificationAnswer(\n",
        "            name=\"first_sub_checklist_answer\",\n",
        "            confidence=0.5 # Confidence scores should be added to the answer\n",
        "          )]\n",
        "        ))\n",
        "      ]\n",
        "    )]\n",
        "  )\n",
        ")\n",
        "\n",
        "nested_checklist_prediction_ndjson = {\n",
        "  \"name\": \"nested_checklist_question\",\n",
        "  \"answer\": [{\n",
        "      \"name\": \"first_checklist_answer\",\n",
        "      \"confidence\": 0.5, # Confidence scores should be added to the answer\n",
        "      \"classifications\" : [\n",
        "        {\n",
        "          \"name\": \"sub_checklist_question\",\n",
        "          \"answer\": {\n",
        "            \"name\": \"first_sub_checklist_answer\",\n",
        "            \"confidence\": 0.5, # Confidence scores should be added to the answer\n",
        "          }\n",
        "        }\n",
        "      ]\n",
        "  }]\n",
        "}\n",
        "\n",
        "nested_radio_prediction = lb_types.ClassificationAnnotation(\n",
        "  name=\"nested_radio_question\",\n",
        "  value=lb_types.Radio(\n",
        "    answer=lb_types.ClassificationAnswer(\n",
        "      name=\"first_radio_answer\",\n",
        "      confidence=0.5, # Confidence scores should be added to the answer\n",
        "      classifications=[\n",
        "        lb_types.ClassificationAnnotation(\n",
        "          name=\"sub_radio_question\",\n",
        "          value=lb_types.Radio(\n",
        "            answer=lb_types.ClassificationAnswer(\n",
        "              name=\"first_sub_radio_answer\",\n",
        "              confidence=0.5 # Confidence scores should be added to the answer\n",
        "            )\n",
        "          )\n",
        "        )\n",
        "      ]\n",
        "    )\n",
        "  )\n",
        ")\n",
        "\n",
        "\n",
        "nested_radio_prediction_ndjson = {\n",
        "  \"name\": \"nested_radio_question\",\n",
        "  \"answer\": {\n",
        "      \"name\": \"first_radio_answer\",\n",
        "      \"confidence\": 0.5,\n",
        "      \"classifications\": [{\n",
        "          \"name\":\"sub_radio_question\",\n",
        "          \"answer\": { \"name\" : \"first_sub_radio_answer\",\n",
        "                     \"confidence\": 0.5}\n",
        "        }]\n",
        "    }\n",
        "}\n",
        "\n"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "############## Classification Free-form text ##############\n",
        "\n",
        "text_prediction = lb_types.ClassificationAnnotation(\n",
        "  name=\"free_text\",  # must match your ontology feature\"s name\n",
        "  value=lb_types.Text(answer=\"sample text\", confidence=0.5)\n",
        ")\n",
        "\n",
        "\n",
        "text_prediction_ndjson = {\n",
        "  \"name\": \"free_text\",\n",
        "  \"answer\": \"sample text\",\n",
        "  \"confidence\": 0.5\n",
        "}"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "######### BBOX with nested classifications #########\n",
        "\n",
        "bbox_dim = {\n",
        "        \"top\": 226.757,\n",
        "        \"left\": 317.271,\n",
        "        \"height\": 194.229,\n",
        "        \"width\": 249.386\n",
        "    }\n",
        "\n",
        "bbox_with_radio_subclass_prediction = lb_types.ObjectAnnotation(\n",
        "    name=\"bbox_with_radio_subclass\",\n",
        "    confidence=0.5,\n",
        "    value=lb_types.DocumentRectangle(\n",
        "        start=lb_types.Point(x=bbox_dim[\"left\"], y=bbox_dim[\"top\"]), # x = left, y = top\n",
        "        end=lb_types.Point(x=bbox_dim[\"left\"] + bbox_dim[\"width\"], y=bbox_dim[\"top\"] + bbox_dim[\"height\"]), # x= left + width , y = top + height\n",
        "        unit=lb_types.RectangleUnit.POINTS,\n",
        "        page=1\n",
        "    ),\n",
        "    classifications=[\n",
        "    \tlb_types.ClassificationAnnotation(\n",
        "        \tname=\"sub_radio_question\",\n",
        "      \t\tvalue=lb_types.Radio(\n",
        "          answer=lb_types.ClassificationAnswer(\n",
        "            name=\"first_sub_radio_answer\",\n",
        "            confidence=0.5,\n",
        "            classifications=[\n",
        "              lb_types.ClassificationAnnotation(\n",
        "                name=\"second_sub_radio_question\",\n",
        "                value=lb_types.Radio(\n",
        "                  answer=lb_types.ClassificationAnswer(\n",
        "                    name=\"second_sub_radio_answer\",\n",
        "                    confidence=0.5\n",
        "                  )\n",
        "                )\n",
        "              )\n",
        "            ]\n",
        "          )\n",
        "          )\n",
        "        )\n",
        "    ]\n",
        ")\n",
        "\n",
        "bbox_with_radio_subclass_prediction_ndjson = {\n",
        "  \"name\": \"bbox_with_radio_subclass\",\n",
        "  \"classifications\": [\n",
        "    {\n",
        "      \"name\": \"sub_radio_question\",\n",
        "      \"answer\": {\n",
        "          \"name\": \"first_sub_radio_answer\",\n",
        "          \"confidence\": 0.5,\n",
        "          \"classifications\": [\n",
        "              {\n",
        "                  \"name\": \"second_sub_radio_question\",\n",
        "                  \"answer\": {\n",
        "                      \"name\": \"second_sub_radio_answer\", \"confidence\": 0.5}\n",
        "               }\n",
        "            ]\n",
        "        }\n",
        "    }\n",
        "  ],\n",
        "  \"bbox\":bbox_dim,\n",
        "  \"page\": 1,\n",
        "  \"unit\": \"POINTS\"\n",
        "}"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "\n",
        "############ NER with nested classifications ########\n",
        "\n",
        "\n",
        "ner_with_checklist_subclass_prediction = lb_types.ObjectAnnotation(\n",
        "  name=\"ner_with_checklist_subclass\",\n",
        " confidence=0.5,\n",
        "  value=lb_types.DocumentEntity(\n",
        "    name=\"ner_with_checklist_subclass\",\n",
        "    text_selections=[\n",
        "      lb_types.DocumentTextSelection(\n",
        "        token_ids=[],\n",
        "        group_id=\"\",\n",
        "        page=1\n",
        "      )\n",
        "    ]\n",
        "  ),\n",
        "  classifications=[\n",
        "    lb_types.ClassificationAnnotation(\n",
        "      name=\"sub_checklist_question\",\n",
        "      value=lb_types.Checklist(\n",
        "      answer=[lb_types.ClassificationAnswer(name=\"first_sub_checklist_answer\", confidence=0.5)]\n",
        "      )\n",
        "    )\n",
        "  ]\n",
        ")\n",
        "\n",
        "\n",
        "ner_with_checklist_subclass_prediction_ndjson = {\n",
        "  \"name\": \"ner_with_checklist_subclass\",\n",
        "  \"classifications\":[\n",
        "    {\n",
        "      \"name\": \"sub_checklist_question\",\n",
        "      \"answer\": [{\"name\": \"first_sub_checklist_answer\", \"confidence\":0.5 }]\n",
        "    }\n",
        "  ],\n",
        "  \"textSelections\": [\n",
        "      {\n",
        "          \"tokenIds\": [\n",
        "              \"<UUID>\"\n",
        "          ],\n",
        "          \"groupId\": \"<UUID>\",\n",
        "          \"page\": 1\n",
        "      }\n",
        "  ]\n",
        "}\n",
        "\n"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 1: Import data rows into Catalog "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "Passing a `text_layer_url` is not longer required. Labelbox automatically generates a text layer using Google Document AI and its OCR engine to detect tokens. \n",
        "\n",
        "However, it's important to note that Google Document AI imposes specific restrictions on document size:\n",
        "- The document must have no more than 15 pages.\n",
        "- The file size should not exceed 20 MB.\n",
        "\n",
        "Furthermore, Google Document AI optimizes documents before OCR processing. This optimization might include rotating images or pages to ensure that text appears horizontally. Consequently, token coordinates are calculated based on the rotated/optimized images, resulting in potential discrepancies with the original PDF document.\n",
        "\n",
        "For example, in a landscape-oriented PDF, the document is rotated by 90 degrees before processing. As a result, all tokens in the text layer are also rotated by 90 degrees.\n",
        "\n",
        "You may still pass a `text_layer_url` if you wish to bypass the automatic text layer generation"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "global_key = \"0801.3483.pdf\"\n",
        "img_url = {\n",
        "    \"row_data\": {\n",
        "      \"pdf_url\": \"https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf\"\n",
        "    },\n",
        "    \"global_key\": global_key\n",
        "}\n",
        "\n",
        "\n",
        "dataset = client.create_dataset(name=\"pdf_demo_dataset\")\n",
        "task = dataset.create_data_rows([img_url])\n",
        "task.wait_till_done()\n",
        "print(f\"Failed data rows: {task.failed_data_rows}\")\n",
        "print(f\"Errors: {task.errors}\")\n",
        "\n",
        "if task.errors:\n",
        "    for error in task.errors:\n",
        "        if 'Duplicate global key' in error['message'] and dataset.row_count == 0:\n",
        "            # If the global key already  exists in the workspace the dataset will be created empty, so we can delete it.\n",
        "            print(f\"Deleting empty dataset: {dataset}\")\n",
        "            dataset.delete()"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 2: Create/select an Ontology for your model predictions\n",
        "Your project should have the correct ontology setup with all the tools and classifications supported for your annotations, and the tool names and classification instructions should match the name/instructions fields in your annotations to ensure the correct feature schemas are matched."
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "## Setup the ontology and link the tools created above.\n",
        "\n",
        "ontology_builder = lb.OntologyBuilder(\n",
        "  classifications=[ # List of Classification objects\n",
        "    lb.Classification(\n",
        "      class_type=lb.Classification.Type.RADIO,\n",
        "      name=\"radio_question\",\n",
        "      scope = lb.Classification.Scope.GLOBAL,\n",
        "      options=[\n",
        "        lb.Option(value=\"first_radio_answer\"),\n",
        "        lb.Option(value=\"second_radio_answer\")\n",
        "      ]\n",
        "    ),\n",
        "    lb.Classification(\n",
        "      class_type=lb.Classification.Type.CHECKLIST,\n",
        "      name=\"checklist_question\",\n",
        "      scope = lb.Classification.Scope.GLOBAL,\n",
        "      options=[\n",
        "        lb.Option(value=\"first_checklist_answer\"),\n",
        "        lb.Option(value=\"second_checklist_answer\")\n",
        "      ]\n",
        "    ),\n",
        "    lb.Classification(\n",
        "      class_type=lb.Classification.Type.TEXT,\n",
        "      name=\"free_text\",\n",
        "      scope = lb.Classification.Scope.GLOBAL\n",
        "    ),\n",
        "    lb.Classification(\n",
        "        class_type=lb.Classification.Type.RADIO,\n",
        "        name=\"nested_radio_question\",\n",
        "        scope = lb.Classification.Scope.GLOBAL,\n",
        "        options=[\n",
        "            lb.Option(\"first_radio_answer\",\n",
        "                options=[\n",
        "                    lb.Classification(\n",
        "                        class_type=lb.Classification.Type.RADIO,\n",
        "                        name=\"sub_radio_question\",\n",
        "                        options=[lb.Option(\"first_sub_radio_answer\")]\n",
        "                    )\n",
        "                ])\n",
        "          ]\n",
        "    ),\n",
        "    lb.Classification(\n",
        "      class_type=lb.Classification.Type.CHECKLIST,\n",
        "      name=\"nested_checklist_question\",\n",
        "      scope = lb.Classification.Scope.GLOBAL,\n",
        "      options=[\n",
        "          lb.Option(\"first_checklist_answer\",\n",
        "            options=[\n",
        "              lb.Classification(\n",
        "                  class_type=lb.Classification.Type.CHECKLIST,\n",
        "                  name=\"sub_checklist_question\",\n",
        "                  options=[lb.Option(\"first_sub_checklist_answer\")]\n",
        "              )\n",
        "          ])\n",
        "      ]\n",
        "    ),\n",
        "  ],\n",
        "  tools=[ # List of Tool objects\n",
        "    lb.Tool( tool=lb.Tool.Type.BBOX,name=\"bounding_box\"),\n",
        "    lb.Tool(tool=lb.Tool.Type.NER, name=\"named_entity\"),\n",
        "    lb.Tool(tool=lb.Tool.Type.NER,\n",
        "            name=\"ner_with_checklist_subclass\",\n",
        "            classifications=[\n",
        "              lb.Classification(\n",
        "                class_type=lb.Classification.Type.CHECKLIST,\n",
        "                name=\"sub_checklist_question\",\n",
        "                options=[\n",
        "                  lb.Option(value=\"first_sub_checklist_answer\")\n",
        "                ]\n",
        "              )\n",
        "          ]),\n",
        "    lb.Tool( tool=lb.Tool.Type.BBOX,\n",
        "            name=\"bbox_with_radio_subclass\",\n",
        "            classifications=[\n",
        "              lb.Classification(\n",
        "                  class_type=lb.Classification.Type.RADIO,\n",
        "                  name=\"sub_radio_question\",\n",
        "                  options=[\n",
        "                    lb.Option(\n",
        "                      value=\"first_sub_radio_answer\" ,\n",
        "                      options=[\n",
        "                        lb.Classification(\n",
        "                          class_type=lb.Classification.Type.RADIO,\n",
        "                          name=\"second_sub_radio_question\",\n",
        "                          options=[lb.Option(\"second_sub_radio_answer\")]\n",
        "                        )]\n",
        "                    )]\n",
        "                )]\n",
        "      )]\n",
        ")\n",
        "\n",
        "ontology = client.create_ontology(\"Document Annotation Import Demo\",\n",
        "                                  ontology_builder.asdict(),\n",
        "                                  media_type=lb.MediaType.Document)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 3: Create a Model and Model Run"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "# create Model\n",
        "model = client.create_model(name=\"PDF_model_run_\"+ str(uuid.uuid4()),\n",
        "                            ontology_id=ontology.uid)\n",
        "# create Model Run\n",
        "model_run = model.create_model_run(\"iteration 1\")"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 4: Send data rows to the Model Run "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "model_run.upsert_data_rows(global_keys=[global_key])"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 5: Create the predictions payload\n",
        "Create the prediction payload using the snippets of code in the **Supported Predcitions** section\n",
        "\n",
        "Labelbox support two formats for the annotations payload: NDJSON and Python Annotation types. Both are described below to compose your annotations into Labels attached to the data rows.\n",
        "\n",
        "The resulting payload should have exactly the same content for annotations that are supported by both"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "To import ner annotations, you must pass a `text_layer_url`, Labelbox automatically generates a `text_layer_url` after importing a pdf asset that doesn't include a `text_layer_url`"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "To extract the generated text layer url we first need to export the data row"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "client.enable_experimental = True\n",
        "task = lb.DataRow.export(client=client,global_keys=[global_key])\n",
        "task.wait_till_done()\n",
        "stream = task.get_stream()\n",
        "\n",
        "text_layer = \"\"\n",
        "for output in stream:\n",
        "    output_json = json.loads(output.json_str)\n",
        "    text_layer = output_json['media_attributes']['text_layer_url']\n",
        "print(text_layer)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "# Helper method\n",
        "def update_text_selections(annotation, group_id, list_tokens, page):\n",
        "  return annotation.update({\n",
        "    \"textSelections\": [\n",
        "      {\n",
        "        \"groupId\": group_id,\n",
        "        \"tokenIds\": list_tokens,\n",
        "        \"page\": page\n",
        "      }\n",
        "    ]\n",
        "  })\n",
        "\n",
        "\n",
        "# Fetch the content of the text layer\n",
        "res = requests.get(text_layer)\n",
        "\n",
        "# Phrases that we want to annotation obtained from the text layer url\n",
        "content_phrases = [\"Metal-insulator (MI) transitions have been one of the\",\n",
        "                   \"T. Sasaki, N. Yoneyama, and N. Kobayashi\"]\n",
        "\n",
        "# Parse the text layer\n",
        "text_selections = []\n",
        "text_selections_ner = []\n",
        "\n",
        "\n",
        "for obj in json.loads(res.text):\n",
        "  for group in obj[\"groups\"]:\n",
        "    if group[\"content\"] == content_phrases[0]:\n",
        "      list_tokens = [x[\"id\"] for x in group[\"tokens\"]]\n",
        "      # build text selections for Python Annotation Types\n",
        "      document_text_selection = lb_types.DocumentTextSelection(groupId=group[\"id\"], tokenIds=list_tokens, page=1)\n",
        "      text_selections.append(document_text_selection)\n",
        "      # build text selection for the NDJson annotations\n",
        "      update_text_selections(annotation=entities_prediction_ndjson,\n",
        "                             group_id=group[\"id\"], # id representing group of words\n",
        "                             list_tokens=list_tokens, # ids representing individual words from the group\n",
        "                             page=1)\n",
        "    if group[\"content\"] == content_phrases[1]:\n",
        "      list_tokens_2 = [x[\"id\"] for x in group[\"tokens\"]]\n",
        "      # build text selections for Python Annotation Types\n",
        "      ner_text_selection = lb_types.DocumentTextSelection(groupId=group[\"id\"], tokenIds=list_tokens_2, page=1)\n",
        "      text_selections_ner.append(ner_text_selection)\n",
        "      # build text selection for the NDJson annotations\n",
        "      update_text_selections(annotation=ner_with_checklist_subclass_prediction_ndjson,\n",
        "                             group_id=group[\"id\"], # id representing group of words\n",
        "                             list_tokens=list_tokens_2, # ids representing individual words from the group\n",
        "                             page=1)\n",
        "\n",
        "\n",
        "#re-write the entity annotation with text selections\n",
        "entities_prediction_document_entity = lb_types.DocumentEntity(name=\"named_entity\",confidence=0.5, textSelections = text_selections)\n",
        "entities_prediction = lb_types.ObjectAnnotation(name=\"named_entity\",value=entities_prediction_document_entity)\n",
        "\n",
        "\n",
        "# re-write the entity annotation + subclassification with text selections\n",
        "classifications = [\n",
        "    lb_types.ClassificationAnnotation(\n",
        "      name=\"sub_checklist_question\",\n",
        "      value=lb_types.Checklist(\n",
        "      answer=[lb_types.ClassificationAnswer(name=\"first_sub_checklist_answer\", confidence=0.5)]\n",
        "      )\n",
        "    )\n",
        "  ]\n",
        "ner_annotation_with_subclass = lb_types.DocumentEntity(name=\"ner_with_checklist_subclass\",confidence=0.5, textSelections= text_selections_ner)\n",
        "ner_with_checklist_subclass_annotation = lb_types.ObjectAnnotation(name=\"ner_with_checklist_subclass\",\n",
        "                                                                   confidence=0.5,\n",
        "                                                                   value=ner_annotation_with_subclass,\n",
        "                                                                   classifications=classifications)\n",
        "\n",
        "# Final NDJSON and python annotations\n",
        "print(f\"entities_annotations_ndjson={entities_prediction_ndjson}\")\n",
        "print(f\"entities_annotation={entities_prediction}\")\n",
        "print(f\"nested_entities_annotation_ndjson={ner_with_checklist_subclass_prediction_ndjson}\")\n",
        "print(f\"nested_entities_annotation={ner_with_checklist_subclass_annotation}\")"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "Python annotation \n"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "label_predictions = []\n",
        "\n",
        "label_predictions.append(\n",
        "    lb_types.Label(\n",
        "        data=lb_types.DocumentData(\n",
        "            global_key=global_key),\n",
        "        annotations = [\n",
        "            entities_prediction,\n",
        "            checklist_prediction,\n",
        "            nested_checklist_prediction,\n",
        "            text_prediction,\n",
        "            radio_prediction,\n",
        "            nested_radio_prediction,\n",
        "            bbox_prediction,\n",
        "            bbox_with_radio_subclass_prediction,\n",
        "            ner_with_checklist_subclass_prediction\n",
        "        ]\n",
        "  )\n",
        ")"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "If using NDJSON: "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "\n",
        "label_predictions_ndjson = []\n",
        "for annot in [\n",
        "    entities_prediction_ndjson,\n",
        "    checklist_prediction_ndjson,\n",
        "    nested_checklist_prediction_ndjson,\n",
        "    text_prediction_ndjson,\n",
        "    radio_prediction_ndjson,\n",
        "    nested_radio_prediction_ndjson,\n",
        "    bbox_prediction_ndjson,\n",
        "    bbox_with_radio_subclass_prediction_ndjson,\n",
        "    ner_with_checklist_subclass_prediction_ndjson\n",
        "  ]:\n",
        "  annot.update({\n",
        "      \"dataRow\": {\"globalKey\": global_key},\n",
        "  })\n",
        "  label_predictions_ndjson.append(annot)\n",
        "\n"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 6: Upload the predictions payload to the Model Run"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "# Upload the prediction label to the Model Run\n",
        "upload_job_prediction = model_run.add_predictions(\n",
        "    name=\"prediction_upload_job\"+str(uuid.uuid4()),\n",
        "    predictions=label_predictions)\n",
        "\n",
        "# Errors will appear for annotation uploads that failed.\n",
        "print(\"Errors:\", upload_job_prediction.errors)\n",
        "print(\"Status of uploads: \", upload_job_prediction.statuses)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Step 7: Send annotations to the Model Run\n",
        "To send annotations to a Model Run, we must first import them into a project, create a label payload and then send them to the Model Run."
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "7.1 Create a labelbox project \n"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "project = client.create_project(name=\"Document Prediction Import Demo\",\n",
        "                                    media_type=lb.MediaType.Document)\n",
        "project.setup_editor(ontology)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "7.2 Create a batch to send to the project "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "project.create_batch(\n",
        "  \"batch_text_prediction_demo\", # Each batch in a project must have a unique name\n",
        "  global_keys=[global_key], # Paginated collection of data row objects, list of data row ids or global keys\n",
        "  priority=5 # priority between 1(Highest) - 5(lowest)\n",
        ")"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "7.3 Create the annotations payload"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "entities_annotation = lb_types.ObjectAnnotation(\n",
        "    name=\"named_entity\",\n",
        "    value= lb_types.DocumentEntity(\n",
        "        name=\"named_entity\",\n",
        "        textSelections=text_selections\n",
        "    )\n",
        ")\n",
        "\n",
        "radio_annotation = lb_types.ClassificationAnnotation(\n",
        "    name=\"radio_question\",\n",
        "    value=lb_types.Radio(answer =\n",
        "        lb_types.ClassificationAnswer(name = \"first_radio_answer\")\n",
        "    )\n",
        ")\n",
        "\n",
        "checklist_annotation = lb_types.ClassificationAnnotation(\n",
        "    name=\"checklist_question\",\n",
        "    value=lb_types.Checklist(answer = [\n",
        "        lb_types.ClassificationAnswer(name = \"first_checklist_answer\"),\n",
        "        lb_types.ClassificationAnswer(name = \"second_checklist_answer\"),\n",
        "    ])\n",
        "  )\n",
        "\n",
        "bbox_dim_1 = {\n",
        "            \"top\": 135.3,\n",
        "            \"left\": 102.771,\n",
        "            \"height\": 109.843,\n",
        "            \"width\": 415.8\n",
        "      }\n",
        "bbox_annotation = lb_types.ObjectAnnotation(\n",
        "    name=\"bounding_box\",  # must match your ontology feature\"s name\n",
        "    value=lb_types.DocumentRectangle(\n",
        "        start=lb_types.Point(x=bbox_dim_1[\"left\"], y=bbox_dim_1[\"top\"]),  # x = left, y = top\n",
        "        end=lb_types.Point(x=bbox_dim_1[\"left\"] + bbox_dim_1[\"width\"], y=bbox_dim_1[\"top\"]+ bbox_dim_1[\"height\"]),  # x= left + width , y = top + height\n",
        "        page=0,\n",
        "        unit=lb_types.RectangleUnit.POINTS\n",
        "        )\n",
        "    )\n",
        "\n",
        "nested_checklist_annotation = lb_types.ClassificationAnnotation(\n",
        "  name=\"nested_checklist_question\",\n",
        "  value=lb_types.Checklist(\n",
        "    answer=[lb_types.ClassificationAnswer(\n",
        "      name=\"first_checklist_answer\",\n",
        "      classifications=[\n",
        "        lb_types.ClassificationAnnotation(\n",
        "          name=\"sub_checklist_question\",\n",
        "          value=lb_types.Checklist(\n",
        "            answer=[lb_types.ClassificationAnswer(\n",
        "            name=\"first_sub_checklist_answer\",\n",
        "          )]\n",
        "        ))\n",
        "      ]\n",
        "    )]\n",
        "  )\n",
        ")\n",
        "\n",
        "nested_radio_annotation = lb_types.ClassificationAnnotation(\n",
        "  name=\"nested_radio_question\",\n",
        "  value=lb_types.Radio(\n",
        "    answer=lb_types.ClassificationAnswer(\n",
        "      name=\"first_radio_answer\",\n",
        "      classifications=[\n",
        "        lb_types.ClassificationAnnotation(\n",
        "          name=\"sub_radio_question\",\n",
        "          value=lb_types.Radio(\n",
        "            answer=lb_types.ClassificationAnswer(\n",
        "              name=\"first_sub_radio_answer\",\n",
        "            )\n",
        "          )\n",
        "        )\n",
        "      ]\n",
        "    )\n",
        "  )\n",
        ")\n",
        "\n",
        "text_annotation = lb_types.ClassificationAnnotation(\n",
        "  name=\"free_text\",\n",
        "  value=lb_types.Text(answer=\"sample text\")\n",
        ")\n",
        "\n",
        "bbox_dim = {\n",
        "        \"top\": 226.757,\n",
        "        \"left\": 317.271,\n",
        "        \"height\": 194.229,\n",
        "        \"width\": 249.386\n",
        "    }\n",
        "\n",
        "bbox_with_radio_subclass_annotation = lb_types.ObjectAnnotation(\n",
        "    name=\"bbox_with_radio_subclass\",\n",
        "\n",
        "    value=lb_types.DocumentRectangle(\n",
        "        start=lb_types.Point(x=bbox_dim[\"left\"], y=bbox_dim[\"top\"]), # x = left, y = top\n",
        "        end=lb_types.Point(x=bbox_dim[\"left\"] + bbox_dim[\"width\"], y=bbox_dim[\"top\"] + bbox_dim[\"height\"]), # x= left + width , y = top + height\n",
        "        unit=lb_types.RectangleUnit.POINTS,\n",
        "        page=1\n",
        "    ),\n",
        "    classifications=[\n",
        "    \tlb_types.ClassificationAnnotation(\n",
        "        \tname=\"sub_radio_question\",\n",
        "      \t\tvalue=lb_types.Radio(\n",
        "          answer=lb_types.ClassificationAnswer(\n",
        "            name=\"first_sub_radio_answer\",\n",
        "\n",
        "            classifications=[\n",
        "              lb_types.ClassificationAnnotation(\n",
        "                name=\"second_sub_radio_question\",\n",
        "\n",
        "                value=lb_types.Radio(\n",
        "                  answer=lb_types.ClassificationAnswer(\n",
        "                    name=\"second_sub_radio_answer\"\n",
        "                  )\n",
        "                )\n",
        "              )\n",
        "            ]\n",
        "          )\n",
        "          )\n",
        "        )\n",
        "    ]\n",
        ")\n",
        "\n",
        "ner_with_checklist_subclass_annotation = lb_types.ObjectAnnotation(\n",
        "  name=\"ner_with_checklist_subclass\",\n",
        "  value=lb_types.DocumentEntity(\n",
        "    name=\"ner_with_checklist_subclass\",\n",
        "    text_selections=text_selections_ner\n",
        "  ),\n",
        "  classifications=[\n",
        "    lb_types.ClassificationAnnotation(\n",
        "      name=\"sub_checklist_question\",\n",
        "      value=lb_types.Checklist(\n",
        "      answer=[lb_types.ClassificationAnswer(name=\"first_sub_checklist_answer\")]\n",
        "      )\n",
        "    )\n",
        "  ]\n",
        ")\n"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "7.4 Create the label object "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "labels = []\n",
        "\n",
        "labels.append(\n",
        "    lb_types.Label(\n",
        "        data=lb_types.DocumentData(\n",
        "            global_key=global_key),\n",
        "        annotations = [\n",
        "            entities_annotation,\n",
        "            checklist_annotation,\n",
        "            nested_checklist_annotation,\n",
        "            text_annotation,\n",
        "            radio_annotation,\n",
        "            nested_radio_annotation,\n",
        "            bbox_annotation,\n",
        "            bbox_with_radio_subclass_annotation,\n",
        "            ner_with_checklist_subclass_annotation\n",
        "        ]\n",
        "  )\n",
        ")"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "7.5 Upload annotations to the project using Label import\n"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "upload_job_annotation = lb.LabelImport.create_from_objects(\n",
        "    client = client,\n",
        "    project_id = project.uid,\n",
        "    name=\"text_label_import_job\"+ str(uuid.uuid4()),\n",
        "    labels=labels)\n",
        "\n",
        "upload_job_annotation.wait_until_done()\n",
        "# Errors will appear for annotation uploads that failed.\n",
        "print(\"Errors:\", upload_job_annotation.errors)\n",
        "print(\"Status of uploads: \", upload_job_annotation.statuses)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "7.6 Send the annotations to the Model Run "
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "# get the labels id from the project\n",
        "model_run.upsert_labels(project_id=project.uid)"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    },
    {
      "metadata": {},
      "source": [
        "## Option deletions for cleanup"
      ],
      "cell_type": "markdown"
    },
    {
      "metadata": {},
      "source": [
        "# project.delete()\n",
        "# dataset.delete()"
      ],
      "cell_type": "code",
      "outputs": [],
      "execution_count": null
    }
  ]
}